library(tidyverse)
library(here)
library(readr)
library(readxl)
library(data.table)
candy <- read_csv(here("clean_data/candy_clean.csv"),
# specifying data types
col_types = cols(gender = col_character(), country = col_character()))
Three seperate Excel files with different formatting
# 2015
"raw_data/boing-boing-candy-2015.xlsx" %>%
here() %>%
read_excel() %>%
data.table()
# 2016
"raw_data/boing-boing-candy-2016.xlsx" %>%
here() %>%
read_excel() %>%
data.table()
# 2017
"raw_data/boing-boing-candy-2017.xlsx" %>%
here() %>%
read_excel() %>%
data.table()
candy %>%
data.table()
What is the total number of candy ratings given across the three years?
candy %>%
# filtering out NAs
filter(!is.na(rating)) %>%
nrow()
## [1] 757108
What was the average age of people who are going out trick or treating and the average age of people not going trick or treating?
candy %>%
group_by(going_out) %>%
# Rounding to whole years
summarise(avg_age = round(mean(age, na.rm = TRUE))) %>%
# Yes first
arrange(desc(going_out))
## `summarise()` ungrouping output (override with `.groups` argument)
For each of joy, despair and meh, which candy bar recieved these ratings the most?
candy %>%
# Filtering out NAs
filter(!is.na(rating)) %>%
group_by(rating, candy) %>%
summarise(
count = n()
) %>%
filter(count == max(count))
## `summarise()` regrouping output by 'rating' (override with `.groups` argument)
How many people rated Starburst as despair?
candy %>%
filter(candy == "starburst") %>%
filter(rating == "despair") %>%
summarise(id_count = n_distinct(person_id))
For the next two questions, count despair as -1, joy as +1 and meh as 0.
candy <-
candy %>%
mutate(rating_numeric = recode(rating,
joy = 1,
despair = -1,
meh = 0))
What was the most popular candy bar by this rating system for each gender in the dataset?
candy %>%
group_by(gender, candy) %>%
summarise(
avg_rating = mean(rating_numeric, na.rm = TRUE)
) %>%
filter(avg_rating == max(avg_rating))
## `summarise()` regrouping output by 'gender' (override with `.groups` argument)
What was the most popular candy bar in each year?
candy %>%
group_by(year, candy) %>%
summarise(avg_rating = mean(rating_numeric, na.rm = TRUE)) %>%
filter(avg_rating == max(avg_rating))
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
What was the most popular candy bar by this rating for people in US, Canada, UK and all other countries?
candy %>%
mutate(country = if_else(country %in% c("united states",
"canada",
"united kingdom"),
country, "other")) %>%
group_by(country, candy) %>%
summarise(avg_rating = mean(rating_numeric, na.rm = TRUE)) %>%
filter(avg_rating == max(avg_rating))
## `summarise()` regrouping output by 'country' (override with `.groups` argument)